This article provides the steps to create population estimates for 5-year age group and sex combinations from 1995 to 2000 at LGA-level (Australia | 2016 and 2021 boundaries).
| Data Property | Specifications |
|---|---|
| Age | 5-year groups |
| Sex | Male, female, persons |
| Spatial scope | Australia |
| Spatial resolution | LGA |
| Temporal scope | 2001 - 2023 |
| Temporal resolution | Yearly |
| Reference | Australian Bureau of Statistics. (2023). Regional population by age and sex. ABS. https://www.abs.gov.au/statistics/people/population/regional-population-age-and-sex/2023. |
# Load data
erp_lga_2001_2023_asgs2021_m <- as.data.table(read_excel("ABS data/erp_lga_2001_2023_asgs2021.xlsx",
sheet = "Table 1", skip = 6))
erp_lga_2001_2023_asgs2021_f <- as.data.table(read_excel("ABS data/erp_lga_2001_2023_asgs2021.xlsx",
sheet = "Table 2", skip = 6))
erp_lga_2001_2023_asgs2021_p <- as.data.table(read_excel("ABS data/erp_lga_2001_2023_asgs2021.xlsx",
sheet = "Table 3", skip = 6))
# Rename LGA code and LGA name to LGA_CODE and LGA_NAME for consistency with other boundary datasets
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA code", new = "LGA_CODE")
setnames(erp_lga_2001_2023_asgs2021_f, old = "LGA code", new = "LGA_CODE")
setnames(erp_lga_2001_2023_asgs2021_p, old = "LGA code", new = "LGA_CODE")
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA name", new = "LGA_NAME")
setnames(erp_lga_2001_2023_asgs2021_f, old = "LGA name", new = "LGA_NAME")
setnames(erp_lga_2001_2023_asgs2021_p, old = "LGA name", new = "LGA_NAME")
# Remove columns not needed
rem_cols <- c("S/T code", "S/T name", "LGA_CODE")
# Checked that no LGA_NAMEs are getting duplicated
# anyDuplicated(erp_lga_2001_2023_asgs2021_f[1:2288,]$LGA_NAME)
erp_lga_2001_2023_asgs2021_m[, (rem_cols) := NULL]
erp_lga_2001_2023_asgs2021_f[, (rem_cols) := NULL]
erp_lga_2001_2023_asgs2021_p[, (rem_cols) := NULL]
rm(rem_cols)
# Age groups
setnames(erp_lga_2001_2023_asgs2021_m,
c("0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85 and over", "Total males"),
c("m_00_04", "m_05_09", "m_10_14", "m_15_19", "m_20_24", "m_25_29", "m_30_34", "m_35_39", "m_40_44", "m_45_49", "m_50_54", "m_55_59", "m_60_64", "m_65_69", "m_70_74", "m_75_79", "m_80_84", "m_85_99", "m_all"))
setnames(erp_lga_2001_2023_asgs2021_f,
c("0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85 and over", "Total females"),
c("f_00_04", "f_05_09", "f_10_14", "f_15_19", "f_20_24", "f_25_29", "f_30_34", "f_35_39", "f_40_44", "f_45_49", "f_50_54", "f_55_59", "f_60_64", "f_65_69", "f_70_74", "f_75_79", "f_80_84", "f_85_99", "f_all"))
setnames(erp_lga_2001_2023_asgs2021_p,
c("0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85 and over", "Total persons"),
c("p_00_04", "p_05_09", "p_10_14", "p_15_19", "p_20_24", "p_25_29", "p_30_34", "p_35_39", "p_40_44", "p_45_49", "p_50_54", "p_55_59", "p_60_64", "p_65_69", "p_70_74", "p_75_79", "p_80_84", "p_85_99", "p_all"))
# Combine all sexes
erp_lga_2001_2023_asgs2021 <- merge(erp_lga_2001_2023_asgs2021_m, erp_lga_2001_2023_asgs2021_f, by = c("Year", "LGA_NAME"))
erp_lga_2001_2023_asgs2021 <- merge(erp_lga_2001_2023_asgs2021, erp_lga_2001_2023_asgs2021_p, by = c("Year", "LGA_NAME"))
erp_lga_2001_2023_asgs2021 <- erp_lga_2001_2023_asgs2021[!is.na(Year)] # Remove NA row
rm(erp_lga_2001_2023_asgs2021_m, erp_lga_2001_2023_asgs2021_f, erp_lga_2001_2023_asgs2021_p)
We will calculate compound annual growth rate for backcasting (opposite of forecasting) based on how population changed from its starting point to end point and considering how many years it took.
https://www.investopedia.com/terms/g/growthrates.asp provides good explanation of this.
The formula for compound annual growth rate is: [(start_value / end_value) ^ 1/years] - 1
# Reshape the data to long format
erp_long <- melt(erp_lga_2001_2023_asgs2021,
id.vars = c("Year", "LGA_NAME"),
variable.name = "Group",
value.name = "Population")
years_to_predict <- 1995:2000
## Growth rate backcasting ##
# Function for growth rate backcasting
extrapolate_backcasting <- function(data, years_to_predict) {
# Years since start to consider for growth rate
n_years <- 1
# 1 year performed best against state level ERP (done in later steps)
# Calculate the annualized growth rate
start_pop <- data$Population[data$Year == min(data$Year)]
end_pop <- data$Population[data$Year == (min(data$Year) + n_years)]
# Handle cases where start_pop is 0 (to avoid division by zero)
if (start_pop == 0) {
return(data.frame(Year = years_to_predict, Population = 0))
}
# Calculate growth rate
growth_rate <- ((end_pop / start_pop)^(1 / n_years)) - 1
# If negative growth rate, reverse the equation to ensure past predictions are higher
# Negative growth rates can overestiamte populations of past years, so we will reduce it by a third
if (growth_rate < 0) {
predictions <- sapply(years_to_predict, function(year) {
predicted_pop <- start_pop * (1 + abs(growth_rate/3))^(min(data$Year) - year)
# Ensure population is not negative or NaN
if (is.na(predicted_pop) || predicted_pop < 0) {
predicted_pop <- 0
}
# Round to the nearest integer
predicted_pop <- round(predicted_pop)
return(predicted_pop)
})
} else {
# If starting population is less than or equal to ending population (positive or zero growth),
# use the original formula
predictions <- sapply(years_to_predict, function(year) {
predicted_pop <- start_pop / (1 + growth_rate)^(min(data$Year) - year)
# Ensure population is not negative or NaN
if (is.na(predicted_pop) || predicted_pop < 0) {
predicted_pop <- 0
}
# Round to the nearest integer
predicted_pop <- round(predicted_pop)
return(predicted_pop)
})
}
return(data.frame(Year = years_to_predict, Population = predictions))
}
# Apply growth rate backcasting to each group
backcasting_data <- erp_long %>%
group_by(`LGA_NAME`, Group) %>%
do(extrapolate_backcasting(., years_to_predict)) %>%
ungroup()
# Combine with original data
full_data_backcasting <- bind_rows(erp_long, backcasting_data)
# Reshape back to wide format
erp_backcasting <- dcast(full_data_backcasting, Year + `LGA_NAME` ~ Group, value.var = "Population")
# Sort the final data
setorder(erp_backcasting, Year, `LGA_NAME`)
# Clean up environment
rm(erp_long, years_to_predict, backcasting_data, full_data_backcasting, extrapolate_backcasting)
dt <- copy(erp_backcasting)
# Subset data for 20 random LGA_NAMEs
set.seed(123)
lga_names <- sample(unique(dt$`LGA_NAME`), 20)
dt <- dt[`LGA_NAME` %in% lga_names]
plot_ly(
data = dt,
x = ~Year,
# y = ~m_all,
# y = ~f_all,
y = ~p_all,
color = ~`LGA_NAME`,
type = 'scatter',
mode = 'lines',
hoverinfo = 'text',
text = ~paste("Year:", Year
, "<br>LGA:", `LGA_NAME`
, "<br>p_all:", p_all)
) %>%
layout(
title = "Australia LGA-level population estimates (persons) for 1995-2023",
xaxis = list(title = "Year"),
yaxis = list(title = "p_all"),
legend = list(title = list(text = "LGA_NAME"))
) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Australia LGA-level population estimates (persons) for 1995-2023'))
# Aggregate by year column and sum all columns disregarding LGA_NAME
dt_agg <- copy(erp_backcasting)
dt_agg <- dt_agg[, `LGA_NAME` := NULL]
dt_agg <- dt_agg[, lapply(.SD, sum), by = Year]
dt_agg <- melt(dt_agg, id.vars = c("Year"), variable.name = "Category", value.name = "Count")
plot_ly(
# data = dt,
data = dt_agg,
x = ~Year,
# y = ~m_all,
# y = ~f_all,
y = ~Count,
color = ~Category,
type = 'scatter',
mode = 'lines', # Line plot
hoverinfo = 'text',
text = ~paste("Year:", Year
# , "<br>LGA:", `LGA_NAME`
, "<br>Count:", Count)
) %>%
layout(
title = "Population Over Time by LGA",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"),
legend = list(title = list(text = "LGA_NAME"))
) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Australia population estimates by age and sex for 1995-2023'))
# Remove temporary objects
rm(dt, dt_agg, lga_names, p)
| Data Property | Specifications |
|---|---|
| Age | 1-year |
| Sex | Male, female, persons |
| Spatial scope | Australia |
| Spatial resolution | State |
| Temporal scope | 1971 - 2024 |
| Temporal resolution | Yearly |
| Reference | Australian Bureau of Statistics. (2024, June). National, state and territory population. ABS. https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/jun-2024. |
We can use these state-level ABS estimates to adjust our LGA-level extrapolations
# New South Wales
erp_state_1971_2024_nsw_1 <-
as.data.table(read_excel("ABS data/TABLE 51. Estimated Resident Population By Single Year Of Age, New South Wales.xlsx",
sheet = "Data1"))
erp_state_1971_2024_nsw_2 <-
as.data.table(read_excel("ABS data/TABLE 51. Estimated Resident Population By Single Year Of Age, New South Wales.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_nsw <- merge(erp_state_1971_2024_nsw_1, erp_state_1971_2024_nsw_2,
by = names(erp_state_1971_2024_nsw_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_nsw[, STATE := "New South Wales"]
# Delete the last 9 rows
erp_state_1971_2024_nsw <- erp_state_1971_2024_nsw[1:(dim(erp_state_1971_2024_nsw)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_nsw_1, erp_state_1971_2024_nsw_2)
# Victoria
erp_state_1971_2024_vic_1 <-
as.data.table(read_excel("ABS data/TABLE 52. Estimated Resident Population By Single Year Of Age, Victoria.xlsx",
sheet = "Data1"))
erp_state_1971_2024_vic_2 <-
as.data.table(read_excel("ABS data/TABLE 52. Estimated Resident Population By Single Year Of Age, Victoria.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_vic <- merge(erp_state_1971_2024_vic_1, erp_state_1971_2024_vic_2,
by = names(erp_state_1971_2024_vic_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_vic[, STATE := "Victoria"]
# Delete the last 9 rows
erp_state_1971_2024_vic <- erp_state_1971_2024_vic[1:(dim(erp_state_1971_2024_vic)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_vic_1, erp_state_1971_2024_vic_2)
# Queensland
erp_state_1971_2024_qld_1 <-
as.data.table(read_excel("ABS data/TABLE 53. Estimated Resident Population By Single Year Of Age, Queensland.xlsx",
sheet = "Data1"))
erp_state_1971_2024_qld_2 <-
as.data.table(read_excel("ABS data/TABLE 53. Estimated Resident Population By Single Year Of Age, Queensland.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_qld <- merge(erp_state_1971_2024_qld_1, erp_state_1971_2024_qld_2,
by = names(erp_state_1971_2024_qld_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_qld[, STATE := "Queensland"]
# Delete the last 9 rows
erp_state_1971_2024_qld <- erp_state_1971_2024_qld[1:(dim(erp_state_1971_2024_qld)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_qld_1, erp_state_1971_2024_qld_2)
# South Australia
erp_state_1971_2024_sa_1 <-
as.data.table(read_excel("ABS data/TABLE 54. Estimated Resident Population By Single Year Of Age, South Australia.xlsx",
sheet = "Data1"))
erp_state_1971_2024_sa_2 <-
as.data.table(read_excel("ABS data/TABLE 54. Estimated Resident Population By Single Year Of Age, South Australia.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_sa <- merge(erp_state_1971_2024_sa_1, erp_state_1971_2024_sa_2,
by = names(erp_state_1971_2024_sa_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_sa[, STATE := "South Australia"]
# Delete the last 9 rows
erp_state_1971_2024_sa <- erp_state_1971_2024_sa[1:(dim(erp_state_1971_2024_sa)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_sa_1, erp_state_1971_2024_sa_2)
# Western Australia
erp_state_1971_2024_wa_1 <-
as.data.table(read_excel("ABS data/TABLE 55. Estimated Resident Population By Single Year Of Age, Western Australia.xlsx",
sheet = "Data1"))
erp_state_1971_2024_wa_2 <-
as.data.table(read_excel("ABS data/TABLE 55. Estimated Resident Population By Single Year Of Age, Western Australia.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_wa <- merge(erp_state_1971_2024_wa_1, erp_state_1971_2024_wa_2,
by = names(erp_state_1971_2024_wa_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_wa[, STATE := "Western Australia"]
# Delete the last 9 rows
erp_state_1971_2024_wa <- erp_state_1971_2024_wa[1:(dim(erp_state_1971_2024_wa)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_wa_1, erp_state_1971_2024_wa_2)
# Tasmania
erp_state_1971_2024_tas_1 <-
as.data.table(read_excel("ABS data/TABLE 56. Estimated Resident Population By Single Year Of Age, Tasmania.xlsx",
sheet = "Data1"))
erp_state_1971_2024_tas_2 <-
as.data.table(read_excel("ABS data/TABLE 56. Estimated Resident Population By Single Year Of Age, Tasmania.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_tas <- merge(erp_state_1971_2024_tas_1, erp_state_1971_2024_tas_2,
by = names(erp_state_1971_2024_tas_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_tas[, STATE := "Tasmania"]
# Delete the last 9 rows
erp_state_1971_2024_tas <- erp_state_1971_2024_tas[1:(dim(erp_state_1971_2024_tas)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_tas_1, erp_state_1971_2024_tas_2)
# Northern Territory
erp_state_1971_2024_nt_1 <-
as.data.table(read_excel("ABS data/TABLE 57. Estimated Resident Population By Single Year Of Age, Northern Territory.xlsx",
sheet = "Data1"))
erp_state_1971_2024_nt_2 <-
as.data.table(read_excel("ABS data/TABLE 57. Estimated Resident Population By Single Year Of Age, Northern Territory.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_nt <- merge(erp_state_1971_2024_nt_1, erp_state_1971_2024_nt_2,
by = names(erp_state_1971_2024_nt_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_nt[, STATE := "Northern Territory"]
# Delete the last 9 rows
erp_state_1971_2024_nt <- erp_state_1971_2024_nt[1:(dim(erp_state_1971_2024_nt)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_nt_1, erp_state_1971_2024_nt_2)
# Australian Capital Territory
erp_state_1971_2024_act_1 <-
as.data.table(read_excel("ABS data/TABLE 58. Estimated Resident Population By Single Year Of Age, Australian Capital Territory.xlsx",
sheet = "Data1"))
erp_state_1971_2024_act_2 <-
as.data.table(read_excel("ABS data/TABLE 58. Estimated Resident Population By Single Year Of Age, Australian Capital Territory.xlsx",
sheet = "Data2"))
# Merge columns two datasets by match on first column
erp_state_1971_2024_act <- merge(erp_state_1971_2024_act_1, erp_state_1971_2024_act_2,
by = names(erp_state_1971_2024_act_1)[1], all = TRUE)
# Add a column of STATE
erp_state_1971_2024_act[, STATE := "Australian Capital Territory"]
# Delete the last 9 rows
erp_state_1971_2024_act <- erp_state_1971_2024_act[1:(dim(erp_state_1971_2024_act)[1] - 9)]
# Remove individual objects
rm(erp_state_1971_2024_act_1, erp_state_1971_2024_act_2)
# rbind all datasets
erp_state_1971_2024 <- rbind(erp_state_1971_2024_nsw
, erp_state_1971_2024_vic
, erp_state_1971_2024_qld
, erp_state_1971_2024_sa
, erp_state_1971_2024_wa
, erp_state_1971_2024_tas
, erp_state_1971_2024_nt
, erp_state_1971_2024_act)
# Remove individual objects
rm(erp_state_1971_2024_nsw
, erp_state_1971_2024_vic
, erp_state_1971_2024_qld
, erp_state_1971_2024_sa
, erp_state_1971_2024_wa
, erp_state_1971_2024_tas
, erp_state_1971_2024_nt
, erp_state_1971_2024_act)
# Change name of the first column to Year
setnames(erp_state_1971_2024,
names(erp_state_1971_2024)[1],
"Year")
# Format the Year column
erp_state_1971_2024[, Year := format(as.Date(as.numeric(Year), origin = "1899-12-30"), "%Y")]
# Remove all rows where Year is below 1995 or above 2023
erp_state_1995_2023 <- copy(erp_state_1971_2024)
erp_state_1995_2023 <- erp_state_1971_2024[Year >= 1995]
erp_state_1995_2023 <- erp_state_1995_2023[Year <= 2023]
# Function to clean and rename columns
convert_colnames <- function(names_vector) {
sapply(names_vector, function(name) {
# Remove any leading/trailing spaces
name <- str_trim(name)
# Ensure the column follows the expected pattern
if (!grepl("^Estimated Resident Population ;", name)) return(name)
# Split by " ; " while handling extra spaces
parts <- unlist(str_split(name, "\\s*;\\s*"))
# Ensure correct splitting
if (length(parts) < 3) return(name) # Skip unexpected formats
gender <- parts[2] # Extract gender
age <- parts[3] # Extract age
# Convert gender to lowercase initial (m/f/p)
gender_initial <- tolower(substr(gender, 1, 1))
# Process age values
age <- str_replace_all(age, " ", "_") # Replace spaces with underscores
# age <- ifelse(str_detect(age, "^[0-9]$"), paste0("0", age), age) # Add leading zero to single digits
age <- str_replace(age, "100_and_over", "100") # Ensure "100_and_over" is correct
# Return final column name
paste0(gender_initial, "_", age)
}, USE.NAMES = FALSE)
}
# Apply function to rename columns
setnames(erp_state_1995_2023,
names(erp_state_1995_2023),
convert_colnames(names(erp_state_1995_2023)))
# Convert all columns to numeric except STATE column
cols_to_convert <- setdiff(names(erp_state_1995_2023), "STATE")
erp_state_1995_2023[, (cols_to_convert) := lapply(.SD, as.numeric), .SDcols = cols_to_convert]
# Form age group categories similar to lga data
# Define age groupings
age_groups <- list(
"00_04" = 0:4, "05_09" = 5:9, "10_14" = 10:14, "15_19" = 15:19,
"20_24" = 20:24, "25_29" = 25:29, "30_34" = 30:34, "35_39" = 35:39,
"40_44" = 40:44, "45_49" = 45:49, "50_54" = 50:54, "55_59" = 55:59,
"60_64" = 60:64, "65_69" = 65:69, "70_74" = 70:74, "75_79" = 75:79,
"80_84" = 80:84, "85_99" = 85:100
)
sexes <- c("m", "f", "p")
# Loop over sexes and age_groups to compute aggregates
for (sex in sexes) {
for (age in names(age_groups)) {
cols <- paste0(sex, "_", age_groups[[age]])
erp_state_1995_2023[, paste0(sex, "_", age) := rowSums(.SD), .SDcols = cols]
}
# Also calculate _all
all_cols <- paste0(sex, "_", 0:100)
erp_state_1995_2023[, paste0(sex, "_all") := rowSums(.SD), .SDcols = all_cols]
}
# Remove individual age groups
erp_state_1995_2023[, c(paste0("m_", 0:100), paste0("f_", 0:100), paste0("p_", 0:100)) := NULL]
# Remove temporary objects
rm(convert_colnames, cols_to_convert, age_groups, age, sexes, sex, cols, all_cols)
# Plot the differences in these two datasets for all years, age groups and sex
erp_backcasting_agg <- copy(erp_backcasting)
erp_backcasting_agg <- erp_backcasting_agg[, `LGA_NAME` := NULL]
erp_backcasting_agg <- erp_backcasting_agg[, lapply(.SD, sum), by = Year]
erp_backcasting_agg <- melt(erp_backcasting_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# add a colour column where value is #f00 when Category column value contains f and #0f0 when the value contains m and #00f when p
erp_backcasting_agg[, color := ifelse(grepl("f", Category), "#f00", ifelse(grepl("m", Category), "#00f", "#0f0"))]
# erp_backcasting_agg <- erp_backcasting_agg[color == "#0f0"]
erp_state_1995_2023_agg <- copy(erp_state_1995_2023)
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, STATE := NULL]
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, lapply(.SD, sum), by = Year]
erp_state_1995_2023_agg <- melt(erp_state_1995_2023_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# add a colour column where value is #f00 when Category column value contains f and #0f0 when the value contains m and #00f when p
erp_state_1995_2023_agg[, color := ifelse(grepl("f", Category), "#f00", ifelse(grepl("m", Category), "#00f", "#0f0"))]
# erp_state_1995_2023_agg <- erp_state_1995_2023_agg[color == "#0f0"]
plot_ly(data = erp_backcasting_agg, x = ~Year, y = ~Value, color = ~Category,
line = list(color = adjust_transparency(erp_backcasting_agg$color, 0.5), width = 1),
type = "scatter", mode = "lines") %>%
add_trace(data = erp_state_1995_2023_agg, x = ~Year, y = ~Value, color = ~Category, line = list(color = adjust_transparency(erp_state_1995_2023_agg$color, 1), width = 2),
type = "scatter", mode = "lines") %>%
layout(title = "Trends Over Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"))
# Checking all the ratios and making a heatmap
erp_backcasting_agg <- copy(erp_backcasting)
erp_backcasting_agg <- erp_backcasting_agg[, `LGA_NAME` := NULL]
erp_backcasting_agg <- erp_backcasting_agg[, lapply(.SD, sum), by = Year]
erp_state_1995_2023_agg <- copy(erp_state_1995_2023)
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, STATE := NULL]
erp_state_1995_2023_agg <- erp_state_1995_2023_agg[, lapply(.SD, sum), by = Year]
# Compute ratios (excluding the Year column)
ratio_table <- erp_state_1995_2023_agg[, lapply(.SD, as.numeric)] / erp_backcasting_agg[, lapply(.SD, as.numeric)]
ratio_table[, Year := erp_backcasting_agg$Year] # Restore Year column
# Convert to long format for ggplot
ratio_long <- melt(ratio_table, id.vars = "Year", variable.name = "Category", value.name = "Ratio")
# ratio_long <- ratio_long[Year <= 2000] # How well extrapolated estimates align with ABS state level estimates?
# Plot heatmap
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "How well extrapolated estimates align with ABS state level estimates?",
x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'How well extrapolated estimates align with ABS state level estimates'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Australia state-level:LGA-level estimates for 1995-2023", y = "Ratio", x = "Year")) %>%
layout(yaxis = list(range = c(0.7, 1.3), tickvals = list(0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3), ticktext = list(0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Australia state-level:LGA-level estimates for 1995-2023'))
ratio_long <- melt(ratio_table, id.vars = "Year", variable.name = "Category", value.name = "Ratio")
ratio_long <- ratio_long[Year >= 2001] # How well ABS estimates align with each other?
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "How well ABS estimates align with each other?", x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'How well ABS estimates align with each other'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Australia state-level:LGA-level estimates for 2001-2023 (zoomed)",
y = "Ratio", x = "Year")) %>%
# layout(yaxis = list(range = c(0.8, 1.1), tickvals = list(0.8, 0.9, 1, 1.1), ticktext = list(0.8, 0.9, 1, 1.1))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Australia state-level:LGA-level estimates for 2001-2023 (zoomed)'))
rm(erp_backcasting_agg, erp_state_1995_2023_agg, ratio_table, ratio_long)
# Add a new column LGA_CODE to erp_backcasting using mapping with LGA_NAME column of erp_lga_2001_2023_asgs2021_m
erp_lga_2001_2023_asgs2021_m <- as.data.table(read_excel("ABS data/erp_lga_2001_2023_asgs2021.xlsx",
sheet = "Table 1", skip = 6))
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA code", new = "LGA_CODE")
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA name", new = "LGA_NAME")
erp_lga_2001_2023_asgs2021_m <- erp_lga_2001_2023_asgs2021_m[,.(LGA_CODE, LGA_NAME)]
erp_lga_2001_2023_asgs2021_m <- unique(erp_lga_2001_2023_asgs2021_m)
erp_backcasting <- merge(erp_backcasting, erp_lga_2001_2023_asgs2021_m, by = "LGA_NAME", all.x = TRUE)
rm(erp_lga_2001_2023_asgs2021_m)
# Form STATE column using LGA_CODE
erp_backcasting[, `LGA_CODE_1` := substr(LGA_CODE, 1, 1)]
erp_backcasting[, `LGA_CODE_1` := as.numeric(LGA_CODE_1)]
erp_backcasting <- erp_backcasting[LGA_CODE_1 != 9] # Removing all Other Territories as their data is not available from state level
erp_backcasting[, STATE := fcase(LGA_CODE_1 == 1, "New South Wales",
LGA_CODE_1 == 2, "Victoria",
LGA_CODE_1 == 3, "Queensland",
LGA_CODE_1 == 4, "South Australia",
LGA_CODE_1 == 5, "Western Australia",
LGA_CODE_1 == 6, "Tasmania",
LGA_CODE_1 == 7, "Northern Territory",
LGA_CODE_1 == 8, "Australian Capital Territory")]
erp_backcasting[,LGA_CODE := NULL]
erp_backcasting[,LGA_CODE_1 := NULL]
# Form columns that contains sums of age group column values for each year and STATE column without considering LGA_NAME column
# Summarise by Year and STATE
# Select only the required numeric columns
numeric_cols <- grep("^m_|^f_|^p_", names(erp_backcasting), value = TRUE)
# Perform aggregation
erp_summarised <- erp_backcasting[, lapply(.SD, sum, na.rm = TRUE), by =
.(Year, STATE), .SDcols = numeric_cols]
# Rename columns by appending "_sum"
setnames(erp_summarised, old = names(erp_summarised)[-c(1,2)],
new = paste0(names(erp_summarised)[-c(1,2)], "_sum"))
# Merge erp_summarised with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_summarised, by = c("Year", "STATE"), all.x = TRUE)
# Rename columns in state by appending "_abs"
setnames(erp_state_1995_2023, old = names(erp_state_1995_2023)[-c(1,2)],
new = paste0(names(erp_state_1995_2023)[-c(1,2)], "_abs"))
# Merge erp_state_1995_2023 with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_state_1995_2023, by = c("Year", "STATE"), all.x = TRUE)
# Identify the sum and abs columns
sum_cols <- grep("_sum$", names(erp_backcasting), value = TRUE)
abs_cols <- sub("_sum$", "_abs", sum_cols) # Get corresponding _abs columns
# Ensure all abs_cols exist before dividing
valid_cols <- abs_cols %in% names(erp_backcasting)
# Create ratio columns
for (i in which(valid_cols)) {
ratio_col <- sub("_sum$", "_ratio", sum_cols[i]) # Define new column name
erp_backcasting[, (ratio_col) := get(abs_cols[i]) / get(sum_cols[i])]
}
# Make all _ratio column values 1 for Year column values 1995:2000
# Identify all _ratio columns
ratio_cols <- grep("_ratio$", names(erp_backcasting), value = TRUE)
# Set _ratio values to 1 for Year between 1995 and 2000
erp_backcasting[Year <= 2000][, (ratio_cols) := 1]
# Visualise the ratios
# make a new dt with only c("Year", "STATE", ratio_cols) columns of erp_backcasting
ratio_table <- erp_backcasting[, c("Year", "STATE", ratio_cols), with = FALSE]
ratio_table <- unique(ratio_table)
# Convert to long format for ggplot
ratio_long <- melt(ratio_table, id.vars = c("Year", "STATE"), variable.name = "Category", value.name = "Ratio")
# ratio_long <- ratio_long[STATE == "New South Wales"]
# ratio_long <- ratio_long[STATE == "Victoria"]
ratio_long <- ratio_long[STATE == "Queensland"]
# ratio_long <- ratio_long[STATE == "South Australia"]
# ratio_long <- ratio_long[STATE == "Western Australia"]
# ratio_long <- ratio_long[STATE == "Tasmania"]
# ratio_long <- ratio_long[STATE == "Northern Territory"]
# ratio_long <- ratio_long[STATE == "Australian Capital Territory"]
# Plot heatmap
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "Heatmap of Ratios after adjustment (non-round numbers)",
x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Heatmap of Ratios after adjustment (non-round numbers)'))
# Multiply numeric_cols with their respective _ratio columns to form _adjusted columns
adjusted_cols <- sub("_ratio$", "_adjusted", ratio_cols)
erp_backcasting[, (adjusted_cols) := Map(`*`, .SD, mget(ratio_cols)), .SDcols = numeric_cols]
# Not rounding the numbers here as they will be done in the next step
# Remove temporary objects
rm(abs_cols, adjusted_cols, i, numeric_cols, ratio_long, ratio_table, ratio_col, ratio_cols, sum_cols, valid_cols)
# all age groups collectively should match m_all, f_all, or p_all
# males and females of each age group should match p_all
correct_population_sums <- function(data, columns_to_fix, column_to_refer) {
# Calculate row-wise sum of selected columns
data[, sum := rowSums(.SD, na.rm = TRUE), .SDcols = columns_to_fix]
# Compute ratio, ensuring no division by zero
data[, ratio := fifelse(get(column_to_refer) == 0, 1, sum / get(column_to_refer))]
# Handle cases where total population is zero
data[get(column_to_refer) == 0, (columns_to_fix) := 0]
data[get(column_to_refer) == 0, ratio := 1]
# Divide each selected column by ratio and round to the nearest integer
data[, (columns_to_fix) := lapply(.SD, function(x) round(x / ratio)), .SDcols = columns_to_fix]
# Calculate new sum and ratio after adjustments
data[, sum_new := rowSums(.SD, na.rm = TRUE), .SDcols = columns_to_fix]
data[, ratio_new := fifelse(get(column_to_refer) == 0, 1, sum_new / get(column_to_refer))]
# Replace Inf and NaN values before plotting
data[is.infinite(ratio_new) | is.na(ratio_new), ratio_new := 1]
# Only plot if there are valid values to plot
deviations <- data[ratio_new < 0.99 | ratio_new > 1.01]$ratio_new
if (length(deviations) > 0) {
plot(deviations, main = "Deviations in Ratio", ylab = "Ratio New", xlab = "Index")
} else {
print("No significant deviations found.")
}
# Count number of affected rows
bad_rows <- nrow(data[ratio_new < 0.99 | ratio_new > 1.01])
# Print deviation statistics
print(paste0("Number of >1% deviations used to update referral column: ", bad_rows))
print(paste0("This is ", format((bad_rows / nrow(data)) * 100, digits = 3), "% of ", nrow(data), " rows"))
if (bad_rows > 0) {
print(paste0("With a mean of ", format(mean(deviations, na.rm = TRUE), digits = 3)))
}
# Adjust 'all' column based on new sum
data[, (column_to_refer) := sum_new]
# Replace any remaining NaN values with 0
data[is.na(data)] <- 0
# Remove temporary columns
data[, c("sum", "ratio", "sum_new", "ratio_new") := NULL]
return(data)
}
# Step 1: Make sure p_all = all age groups
erp_backcasting <- correct_population_sums(
data = erp_backcasting,
columns_to_fix = paste0(grep("p_", names(erp_backcasting), value = TRUE)[1:18], "_adjusted"),
column_to_refer = "p_all_adjusted"
)
[1] "Number of >1% deviations used to update referral column: 10"
[1] "This is 0.0632% of 15834 rows"
[1] "With a mean of 1"
# Step 2: m_all + f_all should be made == p_all
erp_backcasting <- correct_population_sums(
data = erp_backcasting,
columns_to_fix = c("m_all_adjusted", "f_all_adjusted"),
column_to_refer = "p_all_adjusted"
)
[1] "No significant deviations found."
[1] "Number of >1% deviations used to update referral column: 0"
[1] "This is 0% of 15834 rows"
# Step 3: Individual age groups of each sex should be made equal to sum of all age groups of the respective sex
erp_backcasting <- correct_population_sums(
data = erp_backcasting,
columns_to_fix = paste0(grep("m_", names(erp_backcasting), value = TRUE)[1:18], "_adjusted"),
column_to_refer = "m_all_adjusted"
)
[1] "Number of >1% deviations used to update referral column: 26"
[1] "This is 0.164% of 15834 rows"
[1] "With a mean of 0.998"
[1] "Number of >1% deviations used to update referral column: 48"
[1] "This is 0.303% of 15834 rows"
[1] "With a mean of 0.996"
# Step 4: Make sure male and female in each age group = person of that age group
# Now that all major revisions have been made, we can change age groups of persons based on sum of age groups of sex
for (i in 1:18) {
erp_backcasting[, paste0(grep("p_", names(erp_backcasting), value = TRUE)[i], "_adjusted") :=
rowSums(.SD), .SDcols = c(
paste0(grep("m_", names(erp_backcasting), value = TRUE)[i], "_adjusted"),
paste0(grep("f_", names(erp_backcasting), value = TRUE)[i], "_adjusted"))]
}
# Remove all columns except Year, STATE, LGA_NAME and _adjust columns
cols_to_remove <- setdiff(names(erp_backcasting),
c("Year", "STATE", "LGA_NAME",
grep("_adjust", names(erp_backcasting), value = TRUE)))
erp_backcasting[, (cols_to_remove) := NULL]
# Remove all _adjusted at the end of column names
setnames(erp_backcasting,
old = grep("_adjusted$", names(erp_backcasting), value = TRUE),
new = gsub("_adjusted$", "", grep("_adjusted$", names(erp_backcasting), value = TRUE)))
rm(correct_population_sums, cols_to_remove)
b_erp_backcasting <- copy(erp_backcasting)
# Recalculate like we did when we tethered the historical population the first time
# Form columns that contains sums of age group column values for each year and STATE column without considering LGA_NAME column
# Summarise by Year and STATE
# Select only the required numeric columns
numeric_cols <- grep("^m_|^f_|^p_", names(erp_backcasting), value = TRUE)
# Perform aggregation
erp_summarised <- erp_backcasting[, lapply(.SD, sum, na.rm = TRUE), by =
.(Year, STATE), .SDcols = numeric_cols]
# Rename columns by appending "_sum"
setnames(erp_summarised, old = names(erp_summarised)[-c(1,2)],
new = paste0(names(erp_summarised)[-c(1,2)], "_sum"))
# Merge erp_summarised with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_summarised, by = c("Year", "STATE"), all.x = TRUE)
# Merge erp_state_1995_2023 with erp_backcasting based on Year and STATE columns
erp_backcasting <- merge(erp_backcasting, erp_state_1995_2023, by = c("Year", "STATE"), all.x = TRUE)
# Identify the sum and abs columns
sum_cols <- grep("_sum$", names(erp_backcasting), value = TRUE)
abs_cols <- sub("_sum$", "_abs", sum_cols) # Get corresponding _abs columns
# Ensure all abs_cols exist before dividing
valid_cols <- abs_cols %in% names(erp_backcasting)
# Create ratio columns
for (i in which(valid_cols)) {
ratio_col <- sub("_sum$", "_ratio", sum_cols[i]) # Define new column name
erp_backcasting[, (ratio_col) := get(abs_cols[i]) / get(sum_cols[i])]
}
# Make all _ratio column values 1 for Year column values 1995:2000
# Identify all _ratio columns
ratio_cols <- grep("_ratio$", names(erp_backcasting), value = TRUE)
# Set _ratio values to 1 for Year between 1995 and 2000
erp_backcasting[Year <= 2000][, (ratio_cols) := 1]
# Visualise the ratios
# make a new dt with only c("Year", "STATE", ratio_cols) columns of erp_backcasting
ratio_table <- erp_backcasting[, c("Year", "STATE", ratio_cols), with = FALSE]
ratio_table <- unique(ratio_table)
# Convert to long format for ggplot
ratio_long <- melt(ratio_table, id.vars = c("Year", "STATE"), variable.name = "Category", value.name = "Ratio")
# ratio_long <- ratio_long[STATE == "New South Wales"]
# ratio_long <- ratio_long[STATE == "Victoria"]
ratio_long <- ratio_long[STATE == "Queensland"]
# ratio_long <- ratio_long[STATE == "South Australia"]
# ratio_long <- ratio_long[STATE == "Western Australia"]
# ratio_long <- ratio_long[STATE == "Tasmania"]
# ratio_long <- ratio_long[STATE == "Northern Territory"]
# ratio_long <- ratio_long[STATE == "Australian Capital Territory"]
# Plot heatmap
ggplotly(ggplot(ratio_long, aes(x = Year, y = Category, fill = Ratio)) +
geom_tile() +
scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 1) +
labs(title = "How well extrapolated estimates align with ABS state level estimates after adjustments?",
x = "Year", y = "Category", fill = "Ratio") +
theme_minimal()) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'How well extrapolated estimates align with ABS state level estimates after adjustments'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Queensland state-level:LGA-level estimates for 1995-2023",
y = "Ratio", x = "Year")) %>%
layout(yaxis = list(range = c(0.7, 1.3), tickvals = list(0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3), ticktext = list(0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Queensland state-level:LGA-level estimates for 1995-2023'))
ggplotly(ggplot(ratio_long, aes(x = Year, y = Ratio, color = Category)) +
geom_jitter(width = 0.2, alpha = 0.7) + # Adds small noise to prevent overlap
theme_minimal() +
labs(title = "Ratio of Queensland state-level:LGA-level estimates for 1995-2023 (zoomed)",
y = "Ratio", x = "Year")) %>%
# layout(yaxis = list(range = c(0.8, 1.1), tickvals = list(0.8, 0.9, 1, 1.1), ticktext = list(0.8, 0.9, 1, 1.1))) %>%
config(toImageButtonOptions = list(
format = 'svg', scale = 1, height = 1000, width = 1000,
filename = 'Ratio of Queensland state-level:LGA-level estimates for 1995-2023 (zoomed)'))
erp_lga_1995_2023_asgs2021 <- copy(erp_backcasting)
rm(erp_backcasting)
# Add a new column LGA_CODE to erp_lga_1995_2023_asgs2021 using mapping with LGA_NAME column of erp_lga_2001_2023_asgs2021_m
erp_lga_2001_2023_asgs2021_m <- as.data.table(read_excel("ABS data/erp_lga_2001_2023_asgs2021.xlsx",
sheet = "Table 1", skip = 6))
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA code", new = "LGA_CODE")
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA name", new = "LGA_NAME")
erp_lga_2001_2023_asgs2021_m <- erp_lga_2001_2023_asgs2021_m[,.(LGA_CODE, LGA_NAME)]
erp_lga_2001_2023_asgs2021_m <- unique(erp_lga_2001_2023_asgs2021_m)
erp_lga_1995_2023_asgs2021 <- merge(erp_lga_1995_2023_asgs2021, erp_lga_2001_2023_asgs2021_m, by = "LGA_NAME", all.x = TRUE)
rm(erp_lga_2001_2023_asgs2021_m)
# Rearrange columns
setcolorder(erp_lga_1995_2023_asgs2021, c("Year", "STATE", "LGA_NAME", "LGA_CODE",
grep("^m_", names(erp_lga_1995_2023_asgs2021), value = TRUE),
grep("^f_", names(erp_lga_1995_2023_asgs2021), value = TRUE),
grep("^p_", names(erp_lga_1995_2023_asgs2021), value = TRUE)))
write.csv(erp_lga_1995_2023_asgs2021, "Population estimates data/5-year age groups/Yearly/erp_lga_1995_2023_asgs2021.csv", row.names = FALSE)
# Remove STATE column
erp_lga_1995_2023_asgs2021[, STATE := NULL]
# Remove LGA_CODE column
erp_lga_1995_2023_asgs2021[, LGA_CODE := NULL]
Correspondence for converting 2021 LGA to 2016 LGA is available from:
Australian Bureau of Statistics. (2021). ASGS Geographic Correspondences (2021) Edition 3. ABS. https://data.gov.au/data/dataset/asgs-edition-3-2021-correspondences.
# Load correspondence file
cg_lga_2016_lga_2021 <- as.data.table(read.csv('ABS data/CG_LGA_2016_LGA_2021.csv'))
# Convert "SA1_CODE_2021" and "LGA_CODE_2021" to numeric
cg_lga_2016_lga_2021[, LGA_CODE_2016 := as.numeric(LGA_CODE_2016)]
cg_lga_2016_lga_2021[, LGA_CODE_2021 := as.numeric(LGA_CODE_2021)]
# Inspecting the quality of correspondence
plot(table(cg_lga_2016_lga_2021$INDIV_TO_REGION_QLTY_INDICATOR))
# Ensure column names match for merging
setnames(cg_lga_2016_lga_2021, "LGA_CODE_2021", "LGA_CODE")
# Add a new column LGA_CODE to erp_lga_1995_2023_asgs2021 using mapping with LGA_NAME column of erp_lga_2001_2023_asgs2021_m
erp_lga_2001_2023_asgs2021_m <- as.data.table(read_excel("ABS data/erp_lga_2001_2023_asgs2021.xlsx",
sheet = "Table 1", skip = 6))
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA code", new = "LGA_CODE")
setnames(erp_lga_2001_2023_asgs2021_m, old = "LGA name", new = "LGA_NAME")
erp_lga_2001_2023_asgs2021_m <- erp_lga_2001_2023_asgs2021_m[,.(LGA_CODE, LGA_NAME)]
erp_lga_2001_2023_asgs2021_m <- unique(erp_lga_2001_2023_asgs2021_m)
erp_lga_1995_2023_asgs2021 <- merge(erp_lga_1995_2023_asgs2021, erp_lga_2001_2023_asgs2021_m, by = "LGA_NAME", all.x = TRUE)
rm(erp_lga_2001_2023_asgs2021_m)
# Merge lga population data with the correspondence table
merged_data <- merge(erp_lga_1995_2023_asgs2021,
cg_lga_2016_lga_2021[, .(`LGA_CODE`, LGA_CODE_2016, LGA_NAME_2016, RATIO_FROM_TO)],
by = "LGA_CODE", all.x = TRUE, allow.cartesian = TRUE)
# Identify population columns
pop_cols <- setdiff(names(erp_lga_1995_2023_asgs2021), c("Year", "LGA_CODE", "LGA_NAME"))
# Scale population estimates using RATIO_FROM_TO
for (col in pop_cols) {
merged_data[, (col) := get(col) * RATIO_FROM_TO]
}
# Aggregate data at the LGA (2016 lga) level
erp_lga_1995_2023_asgs2016 <- merged_data[, lapply(.SD, sum, na.rm = TRUE),
by = .(Year, LGA_CODE_2016, LGA_NAME_2016), .SDcols = pop_cols]
# Round population to nearest integer
for (col in pop_cols) {
erp_lga_1995_2023_asgs2016[, (col) := round(get(col))]
}
# Making sure totals make sense
# all age groups collectively should match m_all, f_all, or p_all
# males and females of each age group should match p_all
# This time, instead of forcing all age groups to match with p_all (top down approach),
# a bottom up approach will be implemented forcing p_all to be the sum of all age groups.
# Because, for SA2s we had the state population as a benchmark, whereas we are using correspondence data this time.
# Total = sum of all age groups
age_cols <- c("00_04", "05_09", "10_14", "15_19","20_24", "25_29", "30_34", "35_39", "40_44",
"45_49", "50_54", "55_59", "60_64", "65_69", "70_74", "75_79", "80_84", "85_99")
erp_lga_1995_2023_asgs2016[, m_all := rowSums(.SD), .SDcols = paste0("m_", age_cols)]
erp_lga_1995_2023_asgs2016[, f_all := rowSums(.SD), .SDcols = paste0("f_", age_cols)]
erp_lga_1995_2023_asgs2016[, p_all := rowSums(.SD), .SDcols = paste0("p_", age_cols)]
# Persons = males + females
for (age in age_cols) {
erp_lga_1995_2023_asgs2016[, (paste0("p_", age)) := rowSums(.SD), .SDcols = c(paste0("m_", age), paste0("f_", age))]
}
erp_lga_1995_2023_asgs2016[, p_all := rowSums(.SD), .SDcols = c("m_all", "f_all")]
# Rename columns for clarity
setnames(erp_lga_1995_2023_asgs2016, c("LGA_CODE_2016", "LGA_NAME_2016"), c("LGA_CODE", "LGA_NAME"))
# Remove rows with NA LGA_NAME
erp_lga_1995_2023_asgs2016 <- erp_lga_1995_2023_asgs2016[!is.na(LGA_NAME)]
# Form STATE column using LGA_CODE
erp_lga_1995_2023_asgs2016[, `LGA_CODE_1` := substr(LGA_CODE, 1, 1)]
erp_lga_1995_2023_asgs2016[, `LGA_CODE_1` := as.numeric(LGA_CODE_1)]
erp_lga_1995_2023_asgs2016 <- erp_lga_1995_2023_asgs2016[LGA_CODE_1 != 9] # Removing all Other Territories as their data is not available from state level
erp_lga_1995_2023_asgs2016[, STATE := fcase(LGA_CODE_1 == 1, "New South Wales",
LGA_CODE_1 == 2, "Victoria",
LGA_CODE_1 == 3, "Queensland",
LGA_CODE_1 == 4, "South Australia",
LGA_CODE_1 == 5, "Western Australia",
LGA_CODE_1 == 6, "Tasmania",
LGA_CODE_1 == 7, "Northern Territory",
LGA_CODE_1 == 8, "Australian Capital Territory")]
erp_lga_1995_2023_asgs2016[,LGA_CODE_1 := NULL]
# Rearrange columns
setcolorder(erp_lga_1995_2023_asgs2016, c("Year", "STATE", "LGA_NAME", "LGA_CODE",
grep("^m_", names(erp_lga_1995_2023_asgs2016), value = TRUE),
grep("^f_", names(erp_lga_1995_2023_asgs2016), value = TRUE),
grep("^p_", names(erp_lga_1995_2023_asgs2016), value = TRUE)))
# Save lga estimates
write.csv(erp_lga_1995_2023_asgs2016, "Population estimates data/5-year age groups/Yearly/erp_lga_1995_2023_asgs2016.csv", row.names = FALSE)
# Remove temporary objects
rm(cg_lga_2016_lga_2021, merged_data, col, pop_cols, age_cols, age)
dt <- copy(erp_lga_1995_2023_asgs2016)
# Subset data for 20 random SA2_NAMEs
set.seed(12345)
lga_names <- sample(unique(dt$`LGA_NAME`), 100)
dt <- dt[`LGA_NAME` %in% lga_names]
# dt <- dt[grepl("brisbane", `LGA_NAME`, ignore.case = T)]
plot_ly(
data = dt,
# data = dt_agg,
x = ~Year,
y = ~m_all,
# y = ~f_all,
# y = ~p_all,
color = ~`LGA_NAME`, # Group by LGA_NAME
type = 'scatter',
mode = 'lines', # Line plot
hoverinfo = 'text',
text = ~paste("Year:", Year
, "<br>LGA:", `LGA_NAME`
, "<br>p_all:", p_all)
) %>%
layout(
title = "Population (p_all) Over Time by LGA",
xaxis = list(title = "Year"),
yaxis = list(title = "p_all"),
legend = list(title = list(text = "LGA_NAME"))
)
# Aggregate by year column and sum all columns disregarding LGA_NAME and LGA_CODE
dt_agg <- copy(erp_lga_1995_2023_asgs2016)
dt_agg <- dt_agg[, LGA_NAME := NULL]
dt_agg <- dt_agg[, LGA_CODE := NULL]
dt_agg <- dt_agg[, STATE := NULL]
dt_agg <- dt_agg[, lapply(.SD, sum), by = Year]
dt_agg <- melt(dt_agg, id.vars = "Year", variable.name = "Category", value.name = "Value")
# Plot using Plotly
plot_ly(data = dt_agg, x = ~Year, y = ~Value, color = ~Category,
type = "scatter", mode = "lines") %>%
layout(title = "Trends Over Years",
xaxis = list(title = "Year"),
yaxis = list(title = "Count"))
# Remove temporary objects
rm(lga_names, dt, dt_agg)
If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY-SA 4.0. Source code is available at https://github.com/ctrl-shift-vs/Population-estimates, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Singh, et al. (2025, July 7). Population estimates (LGA). Retrieved from https://github.com/ctrl-shift-vs/Population-estimates
BibTeX citation
@misc{singh2025population,
author = {Singh, Vishal and Cramb, Susanna and Cortes-Ramirez, Javier},
title = {Population estimates (LGA)},
url = {https://github.com/ctrl-shift-vs/Population-estimates},
year = {2025}
}